% reference: Incomplete Label Uncertainty Estimation for Petition Victory Prediction with Dynamic Features
% Junxiang Wang, Yuyang Gao, Andreas Züfle, Jingyuan Yang and Liang Zhao. International Conference on Data Mining (ICDM 2018).
% @misc{Wang2018,
%    author  = "Junxiang Wang, Yuyang Gao, Andreas Züfle, Jingyuan Yang and Liang Zhao",
%    title   = "Incomplete Label Uncertainty Estimation for Petition Victory Prediction with Dynamic Features",
%    conference = "Proceedings of International Conference on Data Mining",
%    year    = "2018",
%    month   = "nov"
%}
% Contact: Junxiang Wang (jwang40@gmu.edu)
clear;
% There are petition datasets for six countries in the ./data subfolder:
% AU: Australia
% CA: Canada
% GB: United Kingdom (NOTE(review): this comment used to say "German", but
%     GB is the ISO country code of the United Kingdom - confirm)
% IN: India
% PH: Philippines
% US: United States
% Each dataset file <code>.mat stores the following variables, each one
% prefixed with the country code (e.g. PH_data, PH_label, ...):
% data: n * m petition matrix. -100 means this value is missing.
% ID: n * 1 vector. the IDs of petitions.
% index: n * 1 vector. the mapping from petition data to petition sets.
% label: n * 1 vector. labels of petitions. 1 means this petition is
% victorious, -1 means this petition fails and 0 means this petition is
% unlabeled.
% taskno: n * 1 vector. the task number assigned for multi-task learning
% according to missing patterns.
% where
% n: the number of petition observations (rows), unlabeled ones included.
% m: number of features.
% The petition features include basic petition properties and word counts
% from comments left for these petitions.
%
% Select the dataset by its country code. For example, to use the
% Australia dataset, set country to 'AU'.
country='PH';
% fullfile builds the path with the platform's file separator, so the
% script also runs on Linux/macOS (a literal '.\data\PH.mat' only works on
% Windows). Loading into a struct lets the country-prefixed variables be
% picked by name without touching any other line of the script.
loaded=load(fullfile('.','data',[country '.mat']));
data=loaded.([country '_data']);
label=loaded.([country '_label']);
taskno=loaded.([country '_taskno']);
index=loaded.([country '_index']);
ID=loaded.([country '_ID']);
clear loaded
task_num=max(taskno);
fea_num=size(data,2);
% generate increasing feature blocks according to missing patterns:
% IFB{i} lists the feature columns that are observed (not -100) in the
% first observation of task i.
% NOTE(review): this presumes every observation of a task shares the same
% missing pattern - confirm against how taskno was generated.
IFB=cell(1,task_num);
for i=1:task_num
    first_row=find(taskno==i,1);
    if isempty(first_row)
        % fail with a clear message instead of an out-of-range index error
        error('MLUE:emptyTask','No observation is assigned to task %d.',i);
    end
    IFB{i}=find(data(first_row,:)~=-100);
end
% data preprocessing
% Step 1: put feature column 75 on a log10 scale.
% NOTE(review): which feature column 75 holds is not visible in this
% script - confirm against the dataset description.
% Only observed entries are transformed. Taking the logarithm of the -100
% missing-value sentinel would produce complex numbers, silently turn the
% whole matrix complex and hide the sentinel from the mask computed below.
% NOTE(review): an observed value of 0 would map to -Inf; presumably the
% column is strictly positive - confirm.
log_col=75;
observed=data(:,log_col)~=-100;
data(observed,log_col)=log10(data(observed,log_col));
% Step 2: min-max normalize every feature to [0,1], using observed entries
% only. flag(r,c) is true where data(r,c) is observed.
flag=data~=-100;
for i=1:size(data,2)
    rows=flag(:,i);
    if ~any(rows)
        % the feature is missing everywhere: nothing to normalize
        continue;
    end
    col_min=min(data(rows,i));
    col_max=max(data(rows,i));
    if(col_max==col_min)
        % constant feature: map to 0 instead of dividing by zero
        data(rows,i)=0;
    else
        data(rows,i)=(data(rows,i)-col_min)/(col_max-col_min);
    end
end
% Step 3: fill missing values with 0.
data(data==-100)=0;
petition_num=max(index);
data_num=length(index);
% record the time order for each petition: the k-th observation of a
% petition (in row order) gets time order k.
% Iterating over the petitions (petition_num) rather than over all
% observations (data_num) avoids scanning index once per row, and writing
% through the petition's own row positions keeps the result correct even
% if the rows of a petition are not stored contiguously in ascending
% index order.
timeorder=zeros(data_num,1);
for i=1:petition_num
    rows=find(index==i);
    timeorder(rows)=1:numel(rows);
end
% Partition the observations by label: rows whose label is 0 are the
% unlabeled petitions, every other row is a labeled one.
is_unlabeled=(label==0);
% unlabeled part (it carries no label field)
unlabel_data=struct( ...
    'data',data(is_unlabeled,:), ...
    'taskno',taskno(is_unlabeled), ...
    'index',index(is_unlabeled), ...
    'timeorder',timeorder(is_unlabeled));
% keep only the labeled rows in the workspace variables
data=data(~is_unlabeled,:);
taskno=taskno(~is_unlabeled);
index=index(~is_unlabeled);
timeorder=timeorder(~is_unlabeled);
label=label(~is_unlabeled);
% labeled part
label_data=struct( ...
    'data',data, ...
    'label',label, ...
    'taskno',taskno, ...
    'index',index, ...
    'timeorder',timeorder);
clear is_unlabeled
% Initialize the MLUE model with pretrained SVM classifiers: one linear SVM
% is fitted per task on that task's labeled observations, and its weight
% vector and bias become column t of beta and entry t of b.
svm=cell(task_num,1);
beta=zeros(fea_num,task_num);
b=zeros(1,task_num);
for t=1:task_num
    in_task=(label_data.taskno==t);
    svm{t}=fitcsvm(label_data.data(in_task,:),label_data.label(in_task),'KernelFunction','linear');
    beta(:,t)=svm{t}.Beta;
    b(t)=svm{t}.Bias;
end
% NOTE(review): lambda1/lambda2 are presumably the regularization weights
% of MLUE - confirm against the MLUE implementation.
lambda1=0.01;
lambda2=0.01;
% Apply the MLUE model to predict the unlabeled petitions; beta and b are
% overwritten with the values MLUE returns.
[prediction,beta,b]=MLUE(label_data,unlabel_data,lambda1,lambda2,IFB,beta,b);
% the sign of the prediction score is the predicted label
pred=sign(prediction);